6 Diagnostics and Analysis
6.1 Investigate high variance wells
nrow.source = function(df, facilityname, sourcename) {
stopifnot(length(sourcename)==1)
return(nrow(df %>% filter(well==facilityname, source==sourcename)))
}
well_summaries = outframe %>%
filter(facility %in% well_names, variable=="mf_pred") %>%
group_by(facility) %>%
summarise(mean = mean(value),
sd = sd(value),
n_test = nrow.source(regression_df, unique(facility),"Well Tests"),
n_pi = nrow.source(regression_df, unique(facility), "PI Database"),
use.test = ifelse(n_test>0, "Test data", "No test data"),
use.pi = ifelse(n_pi>0, "PI data", "No PI data")) %>%
arrange(desc(sd))
well_summaries$production.curve = ifelse(well_summaries$facility %in% liq_wells, "Production curve", "Time series")
fp_summaries = list(fp14 = well_summaries %>% filter(facility %in% flows_to(censor('fp14', 'fp'))),
fp15 = well_summaries %>% filter(facility %in% flows_to(censor('fp15', 'fp'))),
fp16 = well_summaries %>% filter(facility %in% flows_to(censor('fp16', 'fp'))))
for (fp in names(fp_summaries)) {
print(xtable(fp_summaries[[fp]] %>% select(-c(use.test, use.pi, production.curve)),
type = "latex",
caption=paste("Data methods feeding flash plant", censor(fp, 'fp')),
label=paste0("tab:well_summaries_", fp)),
table.placement = "H",
file = paste0("../_media/summaries_", fp, ".tex"))
}
n_summaries = well_summaries %>%
group_by(use.pi, use.test) %>%
count()
sourceplot = ggplot(well_summaries, aes(x=1, y=log(sd))) +
geom_boxplot(fill='steelblue') +
geom_label(data=n_summaries, aes(x=-Inf, y=-Inf, hjust=0, vjust=0, label=paste0("n=", n), family="Times New Roman"), label.size=0, fill='white') +
facet_grid(.~ use.pi + use.test) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
labs(title="Differences in Production Error by Data Source", x="Production curve data source", y="log(standard deviation)")# +
# ggsave('../_media/error_source.png', width=24.7*0.5, height=6, units='cm')
ggplotly(sourceplot)sourcetab = well_summaries %>% select(facility, mean, sd, n_test, n_pi)
# print(xtable(sourcetab %>% head(), type = "latex",
# caption="Upon inspection of the wells with the most variance, there is no immediate cause for high variance. This requires further investigation.",
# label="tab:well_summaries"),
# table.placement = "h",
# file = "../_media/well_summaries.tex")
kable(cbind(sourcetab)) %>%
kable_styling() %>%
scroll_box(height = "400px")| facility | mean | sd | n_test | n_pi |
|---|---|---|---|---|
| wk65 | 102.2934749 | 109.2036524 | 2 | 0 |
| wk86 | 70.9098346 | 62.1805718 | 4 | 0 |
| wk28 | 88.4427565 | 51.5255323 | 26 | 0 |
| wk242 | 320.7698705 | 47.5436728 | 28 | 0 |
| wk27 | 167.8148213 | 35.4230481 | 36 | 0 |
| wk59 | 112.4056454 | 28.7084217 | 31 | 0 |
| wk81 | 62.0394664 | 26.9696698 | 24 | 0 |
| wk26b | 111.2556339 | 26.1285785 | 13 | 0 |
| wk116 | 78.2828876 | 25.9934521 | 10 | 0 |
| wk123 | 183.5375107 | 23.1481245 | 31 | 0 |
| wk76 | 123.1079479 | 22.4739026 | 31 | 0 |
| wk96 | 28.7114853 | 22.1074050 | 10 | 0 |
| wk72 | 153.2740928 | 20.2675648 | 26 | 0 |
| wk71 | 130.5979298 | 20.2435489 | 32 | 0 |
| wk67 | 93.0195776 | 19.3605826 | 26 | 0 |
| wk256 | 218.0857737 | 18.6133809 | 32 | 30 |
| wk268 | 88.5181131 | 17.7488203 | 27 | 30 |
| wk266 | 324.7101171 | 16.9025924 | 29 | 30 |
| wk264 | 394.3901555 | 15.7904385 | 34 | 30 |
| wk245 | 420.7193707 | 15.3968544 | 41 | 30 |
| wk26a | 118.4680639 | 14.4939735 | 16 | 0 |
| wk265 | 334.3693427 | 13.7743788 | 34 | 30 |
| wk267 | 306.3311541 | 13.7699431 | 31 | 30 |
| wk74 | 155.9695681 | 13.7300618 | 26 | 0 |
| wk83 | 133.4440544 | 13.4643671 | 26 | 0 |
| wk255 | 378.6293910 | 13.4585292 | 41 | 30 |
| wk250 | 23.7800237 | 12.8711906 | 0 | 30 |
| wk269 | 133.3360082 | 11.7048139 | 25 | 30 |
| wk55 | 63.4294064 | 10.9022814 | 47 | 0 |
| wk235 | 205.8764431 | 10.3997032 | 20 | 0 |
| wk262 | 632.6696669 | 9.9788955 | 36 | 30 |
| wk70 | 150.3092709 | 9.8630844 | 45 | 0 |
| wk244 | 196.6758263 | 9.6261239 | 37 | 30 |
| wk229 | 202.3362270 | 9.3875046 | 92 | 0 |
| wk124 | 253.0930871 | 9.1417281 | 31 | 0 |
| wk222 | 107.4783230 | 8.2663053 | 44 | 0 |
| wk243 | 374.5543382 | 8.0256282 | 52 | 30 |
| wk207 | 150.9524750 | 6.4657032 | 18 | 0 |
| wk247 | 233.2013668 | 6.4012979 | 48 | 30 |
| wk271 | 86.3065011 | 6.3809900 | 6 | 30 |
| wk239 | 132.9953467 | 5.7918576 | 18 | 0 |
| wk46 | 43.9109434 | 5.2722978 | 18 | 0 |
| wk260 | 295.9067337 | 4.8348543 | 33 | 30 |
| wk261 | 250.1655260 | 4.7207026 | 37 | 30 |
| wk253 | 269.6862272 | 4.5556241 | 32 | 30 |
| wk270 | 461.4276278 | 4.0426321 | 5 | 30 |
| wk258 | 198.7028243 | 3.5453950 | 38 | 30 |
| wk259 | 309.1449514 | 3.2888729 | 37 | 30 |
| wk263 | 221.5213149 | 3.2590841 | 34 | 30 |
| wk272 | 207.6220684 | 2.8044848 | 7 | 30 |
| wk254 | 226.6584657 | 2.4440172 | 39 | 30 |
| wk605 | 35.3459877 | 1.3452568 | 0 | 0 |
| wk237 | 14.1962301 | 1.0528268 | 0 | 30 |
| wk606 | 22.1277314 | 0.8462717 | 0 | 0 |
| wk233 | 13.1344091 | 0.5505461 | 0 | 30 |
| wk241 | 45.8576241 | 0.5211013 | 0 | 30 |
| wk238 | 60.1674907 | 0.3661648 | 0 | 30 |
| wk607 | 1.4918209 | 0.3595182 | 0 | 0 |
| wk249 | 1.2393797 | 0.3220994 | 0 | 0 |
| wk610 | 6.7519598 | 0.3090580 | 0 | 0 |
| wk251 | 22.7806092 | 0.2287621 | 0 | 30 |
| wk234 | 30.8827687 | 0.1998563 | 0 | 30 |
| wk25 | 6.7878732 | 0.1874126 | 0 | 0 |
| wk240 | 23.5225646 | 0.1584181 | 0 | 30 |
| wk232 | 0.1900650 | 0.1415838 | 0 | 30 |
| wk228 | 15.0023497 | 0.1067671 | 0 | 30 |
| wk236 | 26.3313337 | 0.0811007 | 0 | 0 |
| wk216 | 8.1389004 | 0.0778926 | 0 | 0 |
| wk252 | 58.9341342 | 0.0641691 | 0 | 30 |
| wk118 | 9.5062240 | 0.0327287 | 0 | 0 |
| wk604 | 0.7711713 | 0.0280654 | 0 | 0 |
| wk66 | 0.0008232 | 0.0008350 | 0 | 0 |
| wk92 | 0.0003939 | 0.0003999 | 0 | 0 |
| wk101 | 0.0003719 | 0.0003774 | 0 | 0 |
| wk88 | 0.0000000 | 0.0000000 | 0 | 0 |
6.2 Regression Fits
prod = as.data.frame(outmatrix) %>%
select(contains('prod')) %>%
gather(key=facility, value=value) %>%
mutate(which=parse_number(facility)) %>%
mutate(whp=data$whp_prod[which],
well = names(ids)[data$well_id_prod[which]]) %>%
rename(mf=value) %>%
group_by(well, whp) %>%
summarise(lower=quantile(mf, 0.025),
upper=quantile(mf, 0.975),
mean=mean(mf))
plotdata = regression_df %>%
filter(well_id %in% ids[production_curve_wells]) %>%
mutate(datetime = factor(as.Date(date))) %>%
mutate(source = factor(source, levels=c("Well Tests", "PI Database")))
# regression plot
regplot = ggplot(prod, aes(x=whp)) +
geom_line(aes(y=mean, color=well)) +
geom_ribbon(aes(ymin=lower, ymax=upper, fill=well), alpha=0.25) +
geom_point(data=plotdata, aes(y=mf, color=well, size=date, shape=source), alpha=0.5) +
labs(title="Linear Regression on Test and PI Data", x="Well-head pressure (bar)", y="Mass flow (T/h)", color="Well", shape="Data source", size="Date", fill="Well") +
coord_cartesian(xlim=c(min(plotdata$whp)*0.9,max(plotdata$whp)*1.1), ylim=c(0,max(plotdata$mf)*1.1))# +
# ggsave('../_media/production_curve.png', width=24.7*0.48, height=24.7*0.48, units='cm')
ggplotly(regplot)6.3 Time Series Plots
tsplotwells = ar_wells
ts_fit = as.data.frame(outmatrix) %>%
select(contains('mf_ts')) %>%
gather() %>%
mutate(index = parse_number(key)) %>% select(-key) %>%
group_by(index) %>%
summarise(lower=quantile(value, 0.025),
upper=quantile(value, 0.975),
mean=mean(value)) %>%
cbind(ts) %>%
mutate(well = factor(names(ids[well_id_ts])),
date_numeric = date_numeric_ts)
# actual observations
tsplotdata = dry_df %>%
filter(well_id %in% ids[tsplotwells]) %>%
mutate(datetime = factor(as.Date(date)),
facility = well)
# experimental AR1 time series
ar_fit = as.data.frame(outmatrix) %>%
select(contains("mu_ar")) %>%
gather() %>%
mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
select(facility, date_numeric, value) %>%
group_by(facility, date_numeric) %>%
summarise(mean=mean(value),
lower=quantile(value, 0.025),
upper=quantile(value, 0.975)) %>%
filter(facility %in% tsplotwells)
# experimental EMA time series
ewma_fit = as.data.frame(outmatrix) %>%
select(contains("mu_ema")) %>%
gather() %>%
mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
select(facility, date_numeric, value) %>%
group_by(facility, date_numeric) %>%
summarise(mean=mean(value),
lower=quantile(value, 0.025),
upper=quantile(value, 0.975)) %>%
filter(facility %in% tsplotwells)
# find plot limits
tsmax = max(c(ts_fit$upper, ar_fit$upper, ewma_fit$upper))
lintsplot = ggplot(ts_fit, aes(x=date_numeric, color=well, fill=well)) +
geom_line(aes(y=mean), linetype="dashed") +
geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.25) +
geom_line(data=tsplotdata, aes(y=mf)) +
geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
coord_cartesian(ylim=c(0, 60)) +
labs(title=paste("Linear Time Series Regression for Selected Wells in PI"), x="Days since baseline (2000)", linetype="")# +
# ggsave('../_media/dry_time_series.png', width=24.7, height=8, units='cm')
arplot = ggplot(ar_fit %>% filter(facility %in% tsplotwells), aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
geom_line(data=tsplotdata, aes(y=mf)) +
geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
labs(title="AR(1) Experiment", x="Days since first date", y="Mass flow (T/h)") +
ggsave('../_media/ar_experiment.png', width=24.7, height=8, units='cm')
ewmaplot = ggplot(ewma_fit, aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
geom_line(data=tsplotdata, aes(y=mf)) +
geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
labs(title="EWMA Experiment", x="Days since first date")# +
# ggsave('../_media/ewma_experiment.png', width=24.7, height=8, units='cm')
ggplotly(lintsplot)ggplotly(arplot)ggplotly(ewmaplot)tsgrob = grid_arrange_shared_legend(lintsplot, arplot, ewmaplot, nrow=3, ncol=1, position = "bottom")
tsgrob## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (2-2,1-1) arrange gtable[guide-box]
# ggsave('../_media/ts_experiment.png', tsgrob, width=24.7, height=24, units='cm')6.4 Goodness of fit (OLS regression)
liq_fit = as.data.frame(outmatrix) %>%
select(contains('mf_fit')) %>%
gather(key='index', value='fitted') %>%
mutate(index=as.integer(parse_number(index))) %>%
group_by(index) %>%
summarise(lower=quantile(fitted, 0.025),
upper=quantile(fitted, 0.975),
Fitted=mean(fitted),
std=sd(fitted)) %>%
cbind(regression_df) %>%
mutate(`Standardised residual` = (Fitted-mf)/std,
Well = factor(names(ids[well_id])),
Observed = mf) %>%
gather(key="key", value="value", `Standardised residual`, Observed) %>%
select(Well, key, Fitted, value, source)
diagplot = ggplot(liq_fit, aes(x=Fitted, y=value)) +
geom_point(aes(color=Well, shape=Well)) + scale_shape_manual(values = rep_len(1:25, length(unique(liq_fit$Well)))) +
geom_smooth(color='black') +
facet_wrap(~key, scales="free") +
geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
# coord_cartesian(ylim=c(-4, 4)) +
labs(title="Diagnostic Plots", x="Fitted mass flow (T/h)", y="") +
theme(legend.position = "bottom") +
guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T))# +
# ggsave('../_media/diagnostics.png', width=24.7, height=12, units='cm')
ggplotly(diagplot)## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
selectwells = liq_fit %>% group_by(Well, key) %>% summarise(fittedsd = sd(Fitted)) %>%
arrange(desc(fittedsd)) %>% head(56*2) %>% pull(Well)
observedplot = ggplot(liq_fit %>% filter(key=="Observed", Well %in% selectwells), aes(x=Fitted, y=value)) +
geom_point(aes(color=source), alpha=0.5) +
geom_smooth(color=NA, alpha=0.5) +
facet_wrap(~Well, scales="free") +
# geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b)) +
labs(title="Linear Regression Fit Plots Per Well", x="Fitted mass flow (T/h)", y="Observed mass flow (T/h)", color="Data source") +
theme(legend.position = "bottom")# +
# guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
# ggsave('../_media/observed.png', width=24.7, height=24.7, units='cm')
stdresplot = ggplot(liq_fit %>% filter(key=="Standardised residual", Well %in% selectwells), aes(x=Fitted, y=value)) +
geom_point(aes(color=source), alpha=0.5) +
geom_smooth(color=NA, alpha=0.5) +
facet_wrap(~Well, scales="free_x") +
geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
# geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
labs(title="Linear Regression Residual Plots Per Well", x="Fitted mass flow (T/h)", y="Standardised residual", color="Data source") +
coord_cartesian(ylim=c(-5, 5)) + theme(legend.position="bottom")# +
# guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
# ggsave('../_media/stdres.png', width=24.7, height=24.7, units='cm')
ggplotly(observedplot)## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplotly(stdresplot)## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# stdres_min = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% min()
# stdres_max = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% max()
# ggplot(liq_fit %>% filter(key=="Standardised residual"), aes(x=value)) +
# geom_density(fill="red", alpha=0.5, color=NA) +
# geom_line(data=data.frame(x=seq(stdres_min, stdres_max, length.out=100)), aes(x=x, y=dnorm(x)))6.5 Limits and Constraint Violations
sf.df <- outframe %>%
filter(str_detect(variable, "total_sf") & value > 0) %>%
droplevels()
limits = fp_constants %>%
mutate(facility = names(ids)[fp_id]) %>%
select(facility, limit) %>%
drop_na()
p.limits = sf.df %>%
left_join(limits, by=c("facility")) %>%
mutate(greater = value > limit) %>%
group_by(facility) %>%
summarise(p.greater = mean(greater)) %>%
drop_na()
limitplot = ggplot(sf.df %>% filter(facility %ni% incomplete.fps), aes(x=value, fill=facility)) +
facet_wrap(~facility, scales = "free_y", ncol=2) +
geom_density(alpha=0.5, color=NA) +
geom_vline(data=limits, aes(xintercept=limit), color="red") +
geom_label(data=p.limits %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>lim)=", p.greater), family="Times New Roman"), color="black", label.size=0, fill='white') +
theme(legend.position="none",
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
labs(title="Posterior Flash Plant Mass Flows", x="Steam flow (T/h)", y="Density", fill="Flash plant", color="Steam flow limit")# +
# ggsave('../_media/constraints.png', width=24.7, height=10, units='cm')
ggplotly(limitplot)6.6 Flow Comparison
flow.df <- outframe %>%
filter(facility %in% fp_names) %>%
filter(str_detect(variable, "mf_pred|ip_sf|lp_sf|wf") & value > 0) %>%
mutate(variable=ifelse(variable=="mf_pred", "mf", variable),
variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf")))
comparison = fp_constants %>% select("fp", contains("verification")) %>%
rename(facility=fp) %>%
gather(key="variable", value="value", -facility) %>%
mutate(variable = gsub("^verification_", "", variable),
variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
drop_na()
ps = flow.df %>%
left_join(comparison, by=c("facility", "variable")) %>%
mutate(greater = value.x > value.y) %>%
group_by(facility, variable) %>%
summarise(p.greater = mean(greater)) %>%
mutate(variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
drop_na()
verificationplot = ggplot(flow.df %>% filter(facility %ni% incomplete.fps), aes(x=value)) +
geom_density(aes(y=..scaled.., fill=variable, color=variable), alpha=0.5, show.legend=F) +
geom_vline(data=comparison %>% filter(facility %ni% incomplete.fps), aes(xintercept=value)) +
geom_label(data=ps %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>x)=", p.greater), family="Times New Roman"), label.size=0) +
facet_grid(facility~variable, scales="free", space="free_y") +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
labs(x="Value", y="Scaled density", title="Comparison Between Predicted FP Flows and Sample Data")# +
# ggsave('../_media/verification.png', width=24.7, height=20, units='cm')
ggplotly(verificationplot)